#1.load the data
data<- data.frame( y  = c(81, 55, 80, 24, 78, 52, 88, 45, 50, 69, 66, 45, 24, 43, 38, 72, 41, 38, 52, 52, 66, 89),
                   x1 = c(124, 49, 181, 4, 22, 152, 75, 54, 43, 41, 17, 22, 16, 10, 63, 170, 125, 12, 221, 171, 97, 254),
                   x2 = c(33, 31, 38, 17, 20, 39, 30, 29, 35, 31, 23, 21, 8, 23, 37, 40, 38, 25, 39, 33, 38, 39),
                   x3 = c(8, 6, 8, 2, 4, 6, 7, 7, 6, 5, 4, 3, 3, 3, 6, 8, 6, 4, 7, 7, 6, 8)
                  )
data

y <- data$y
x1<- data$x1
x2<- data$x2
x3<- data$x3

#2.Scatter Plot
par(mar = c(4, 4, 1, 1), mfrow=c(2,2))
plot(x1 , y , xlab="x1" , ylab="y")
plot(x2 , y , xlab="x2" , ylab="y")
plot(x3 , y , xlab="x3" , ylab="y")


#3.regress y on x1 + x2 + x3
model <- lm(y ~ x1 + x2 + x3, data = data)
summary(model)

#4.Estimated values
model$coefficients

#5.C.I for betas: 𝛽 +- t_critical. S𝛽
confint(model, level=0.95)

#Interpretation:In general, the confidence interval shows that if we repeat 
#the test for all possible samples, 95% of the times, the true parameter will be within this interval.
#What we can conclude is that, if an coefficient confident interval includes zero, 
#then we must question the significance of this coefficient.
#so from the results ...

#6.coefficient of determination R_squared = SSR/SST
#Find SSR
ssr <- sum((model$fitted.values - mean(y))^2)

#Find SST
sst <- sum((y - mean(y))^2)

#Find the coefficinet of Determination
r_squared <- ssr / sst
r_squared
#Interpretation:Coefficient of Determination is a ratio of SSR/SST,
# so it measures how much of the variation is our model able to capture and predict.
# so this model explains about ...% of the data

#7.Correlation coefficient r(two by two)
cor(data)

#Interpretation:so there exists a:

#  strong positive relation between dpi & pop75 (r=0.78)
#  strong negative relation between pop15 & pop75 (r=-0.9)
#  strong negative relation between pop75 & dpi (r=-0.75)
#  there exists some relatively weaker relation between sr and other attributes

#8.Hypothesis test for betas using F-ratio
n<- length(y)

#a) H0: 𝛽1 = 0 in the model y = 𝛽0 + 𝛽1 x1 
model1 <- lm(y ~ x1, data = data)
anova(model1)
# f critical ,df1 = p = 1, df2 = n - (p + 1) = 50 - (2) = 48
qf(df1= 1,df2= n - 2,p=.95)
#Interpreration:F = .... > F-critical (also p-value < 0.05) ⇒ reject H0 ⇒ β1 is different than zero : x1is significant.
#or Fvalue=... < Fcritical (also p-value=...>0.05) ⇒ failed to reject H0 ⇒  𝛽2 = 0 : Pop75 non significative

#b) H0: 𝛽2 = 0 in the model y = 𝛽0 + 𝛽1 x1 + 𝛽2 x2 
model2 <- lm(y ~ x1 + x2, data = data)
anova(model2)
# f critical ,df1 = p = 2, df2 = n - (p + 1) = 50 - (3) = 47
qf(df1= 2,df2= n - 3 ,p=.95)
#Interpreration:F = .... > F-critical (also p-value < 0.05) ⇒ reject H0 ⇒ β1 is different than zero : x1is significant.
#or Fvalue=... < Fcritical (also p-value=...>0.05) ⇒ failed to reject H0 ⇒  𝛽2 = 0 : Pop75 non significative

#c) H0: H0:𝛽3=0 in the model y=𝛽0+𝛽1 x1+𝛽2 x2+𝛽3 x3
model3 <- lm(y ~ x1 + x2 + x3, data = LifeCycleSavings)
anova(model3)
# f critical ,df1 = p = 3, df2 = n - (p + 1) = 50 - (4) = 48
qf(df1= 3,df2= n - 4 ,p=.95)
#Interpreration:F = .... > F-critical (also p-value < 0.05) ⇒ reject H0 ⇒ β1 is different than zero : x1is significant.
#or Fvalue=... < Fcritical (also p-value=...>0.05) ⇒ failed to reject H0 ⇒  𝛽2 = 0 : Pop75 non significative


#9.Hypothesis Test on r

#H0: ρ(x,y)=0
#H1: ρ(x,y)≠0

#t-test
test <- cor.test(x1, y)
test

#t-critical
t_critical <- qt(df= n - 2, p=0.025)
t_critical

#Interpretation:t = 13.88 ⇒ |t| > t-critical ⇒ reject H0 ⇒ ρ(x,y) ≠ 0 : There exist a linear relationship between x and y

#10.Residuals
residuals <- model$residuals

#Check the normality of residuals by performing the shapiro test
shapiro.test(residuals)
#Interpretation: W = ... (close to 1) and p = ... > 0.05 then we fail to reject H0.
#This suggests that the residuals are normally distributed

#11.Residuals Graph
plot(residuals, model$fitted.values)

#12. Which ... correspond to the largest and smallest residual.

# which() function in R returns the position or the index of the value which satisfies the given condition

largest_residual_index <- which.max(residuals)
data[largest_residual_index,]
largest_residual_index

smallest_residual_index <- which.min(residuals)
LifeCycleSavings[smallest_residual_index,][0]
smallest_residual_index 

#13.multiple regression matrix :  𝑌 = 𝛽𝑋 + 𝜀
model.matrix(model)

#14.calculate the levers(hii = xi(X'X)^ -1)
leverages <- hatvalues(model)
leverages

#15.Graph the levers 
plot(leverages)

#16. Prove that the sum of the levers is equal to p + 1
sum(leverages)

#17. Give the number of levers which are greater than 2^(𝑝+1/𝑛) Name the ... that correspond to these levers
#N.B: all lever greater than 2^(𝑝+1/𝑛) is suspect
p <- 3
n <- length(leverages)
threshold <- 2^((p+1)/n)
num_greater_threshold <- sum(leverages > threshold)
cat("Number of levers greater than the threshold:", num_greater_threshold)
#Name the ... that correspond to these levers
leverages[which(leverages>threshold)]

#18. internal studentized residuals --> r(i) = ei / (s(i).sqrt(1-hii))
#Calculate the internal studentized residuals
rs <- rstandard(model)
rs

#Find t
t <- qt(df=n-p-1, p= 0.975)
t
# a point is suspect if  |ri| > t , then:
rs[which(rs>t)]

#19. external studentized residuals --> r(-i) = ei / (s(-i).sqrt(1-hii))
#Calculate the internal studentized residuals
rs2 <- rstudent(model)
rs2
# a point is suspect if  |r(-i)| > t , then:
rs2[which(rs2>t)]

#20. Cook's distance: Ci = (1/p) . (hii/1-hii) . (ri)^2
#(i) will be suspect if ci > 1
cd <- cooks.distance(model)
cd[which(cd>1)]

#21. Hypothesis for Homoscedasticity
#H0: Homoscedasticity of errors
#H1: Heteroscedasticity of errors

#install lmtest package
install.packages("lmtest")

# Perform Breusch-Pagan test for homoscedasticity
library(lmtest)
bp_test <- bptest(model)

# Display the test results
print(bp_test)
#Interpretation: p-value > 0.05 ⇒ fail to reject H0 ⇒ it suggests that there 
#is no significant evidence of heteroscedasticity in the residuals.
# In other words, the assumption of constant variance of errors across different levels
#of the independent variables is not violated.

#remedy for the heteroskedasticity problem:One common remedy for heteroscedasticity 
#is to use weighted least squares (WLS) regression.


#Assumptions of regression model:
# .Linearity of the model
# .Independence of errors
# .Homoscedasticity of errors(constant variance)
# .Normality of errors

#Problem1:Non-Linearity of the model --> Remedy: Transformation of Variables

#Problem2: Heteroscedasticity of errors --> Remedy: weighted least squares (WLS) regression.

#Problem3: Outliers --> remove outlier or investigate further